In [None]:
import pytz
import datetime
import marimo as mo

# Stamp the notebook header with the current time in the Asia/Kolkata zone.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)

# Date, 12-hour clock with AM/PM, and the timezone abbreviation.
curr = format(now, "%Y-%m-%d, %I:%M:%S %p %Z")

# Build the markdown first so the render call stays a one-liner.
_header_md = rf"""
# Week - 2

**Submission Date:** `This assignment will not be graded and is only for practice.`

**Last Updated:** `{curr}`
"""
mo.md(_header_md)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the practice dataset; the path is resolved relative to the working directory.
_dataset_path = "Week-2/Practice Assignment/dataset.csv"
df = pd.read_csv(_dataset_path)

In [None]:
# Print the frame summary first, then show the first five rows as the cell output.
df.info()
df.head(n=5)

### Question 1

How many samples are there in this dataset?

In [None]:
# Number of samples = number of rows in the frame.
len(df)

### Question 2

How many input features are there in this dataset?

In [None]:
# Every column except one (the target) counts as an input feature.
len(df.columns) - 1

### Question 3

What type of problem is this?
> **Hint:** look at the target variable.

In [None]:
# Answer to Question 3, printed as plain text.
_problem_type = "Classification"
print(_problem_type)

### Question 4

Which of the features has the least variance?
> **Hint:** Ignore/remove missing values if any. Are there any categorical features?

In [None]:
# Take the first four columns with "?" swapped for NaN, and force them to numeric
# (anything that still cannot be parsed also becomes NaN).
_first_four = df.replace("?", np.nan).iloc[:, :4]
_first_four_numeric = _first_four.apply(pd.to_numeric, errors="coerce")

# Variance per column, ignoring NaN; the answer is the label of the smallest one.
_variances = _first_four_numeric.var(skipna=True)
_variances.idxmin()

### Question 5

Which of the following features has the fewest outliers?
> **Hint:** Which visualization marks outliers? Ignore/remove missing values, if any.

In [None]:
import matplotlib.pyplot as plt

# Numeric working copy: V1 and V2 are re-parsed as numbers so that unparseable
# entries turn into NaN; the original `df` is left untouched.
df2 = df.assign(
    **{_name: pd.to_numeric(df[_name], errors="coerce") for _name in ("V1", "V2")}
)

cols_to_plot = ["V1", "V2", "V3", "V4"]

# One box plot per feature on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.suptitle("Box Plots for All Numerical Features", fontsize=16)

for _col, ax in zip(cols_to_plot, axes.ravel()):
    # Drop NaN before plotting so missing entries are ignored.
    ax.boxplot(df2[_col].dropna())
    ax.set(title=f"Box Plot of {_col}", ylabel=_col)

# Reserve a strip at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig

In [None]:
# Count, per feature, the values lying outside the 1.5 * IQR fences.
outlier_counts = {}
for _col in ("V1", "V2", "V3", "V4"):
    data_col = df2[_col].dropna()

    Q1, Q3 = (data_col.quantile(_q) for _q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # `between` is inclusive at both ends, so its negation selects exactly the
    # values strictly below the lower fence or strictly above the upper fence.
    _within_fences = data_col.between(lower_bound, upper_bound)
    outlier_counts[_col] = (~_within_fences).sum()

print("Number of outliers in each feature:")
for feature, count in outlier_counts.items():
    print(f"\t- {feature}: {count}")

### Question 6

Which of the following is true about the target variable?

- [ ] It has more 'YES' entries than 'NO' entries.
- [x] It has more 'NO' entries than 'YES' entries.
- [ ] It has the same number of 'YES' and 'NO' entries.

In [None]:
# Tally the two target labels; the cell shows the pair (YES count, NO count).
_is_yes = df["Target"].eq("YES")
_is_no = df["Target"].eq("NO")
_is_yes.sum(), _is_no.sum()

### Question 7

How many features have missing values in the dataset?
> **Hint:** missing values are marked by '?'

In [None]:
# A column counts if at least one of its cells is the "?" placeholder.
_has_placeholder = df.eq("?").any(axis=0)
_has_placeholder.sum()

### Question 8

What is the total number of missing values in the dataset?

In [None]:
# Total number of "?" cells across the whole frame.
_placeholder_mask = df.eq("?")
_placeholder_mask.to_numpy().sum()

### Question 9

What is the average of the first feature (i.e. 'V1') after applying SimpleImputer(strategy = 'median')?

In [None]:
from sklearn.impute import SimpleImputer

# Parse V1 as numbers: "?" is mapped to NaN and any other unparseable text is coerced to NaN.
_v1_raw = df["V1"].replace("?", np.nan)
V1_series = pd.to_numeric(_v1_raw, errors="coerce")

# The imputer wants a 2-D array, so present the series as a single column.
_imputer = SimpleImputer(strategy="median")
_v1_filled = _imputer.fit_transform(V1_series.to_numpy().reshape(-1, 1))

# Mean of the median-imputed column, rounded to three decimals.
round(_v1_filled.mean(), 3)

### Question 10

What is the average of the first column after applying KNNImputer(n_neighbors = 3)?

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation finds neighbours using the features a row still has. Fitting on
# V1 alone leaves rows with a missing V1 without any observed feature, so
# scikit-learn falls back to the column mean and `n_neighbors` has no effect.
# Fit on all numeric features instead and then read off the first column.
# NOTE(review): assumes V1-V4 are the numeric features, as in the variance and
# box-plot cells — confirm against the dataset.
_numeric_features = (
    df[["V1", "V2", "V3", "V4"]]
    .replace("?", np.nan)
    .apply(pd.to_numeric, errors="coerce")
)

_imputer = KNNImputer(n_neighbors=3)
_knn_filled = _imputer.fit_transform(_numeric_features)

# Column 0 of the imputed array corresponds to V1.
round(_knn_filled[:, 0].mean(), 3)

### Question 11

What is the average of the second feature (i.e. 'V2') after applying SimpleImputer(strategy = 'mean')?

In [None]:
# Parse V2 as numbers: "?" is mapped to NaN and any other unparseable text is coerced to NaN.
_v2_raw = df["V2"].replace("?", np.nan)
V2_series = pd.to_numeric(_v2_raw, errors="coerce")

# The imputer wants a 2-D array, so present the series as a single column.
_imputer = SimpleImputer(strategy="mean")
_v2_filled = _imputer.fit_transform(V2_series.to_numpy().reshape(-1, 1))

# Mean of the mean-imputed column, rounded to three decimals.
round(_v2_filled.mean(), 3)